library(MASS)
library(car)
## Loading required package: carData
credit_data <- read.csv("credit_card_data.csv")
set.seed(7)
train_index <- sample(1:nrow(credit_data), size = nrow(credit_data) * 0.7)
train_data <- credit_data[train_index, ]
test_data <- credit_data[-train_index, ]
head(train_data) #923 rows
## card reports age income share expenditure owner selfemp
## 476 yes 0 21.50000 2.3779 0.0179990800 35.16667 no no
## 706 yes 0 23.41667 1.8600 0.2670521000 413.59750 no no
## 218 yes 0 29.16667 2.5000 0.0767256000 159.59500 no no
## 630 yes 0 24.08333 1.6200 0.0105080200 13.68583 no no
## 1016 no 0 30.08333 3.1200 0.0003846154 0.00000 no no
## 835 yes 1 36.33333 7.3500 0.0712285700 436.27500 no no
## dependents months majorcards active
## 476 0 7 0 5
## 706 0 3 1 4
## 218 0 7 1 21
## 630 0 12 0 4
## 1016 2 12 0 5
## 835 0 51 0 14
# head(test_data) #396 rows
model1 <- lm(expenditure ~ income + share + dependents + months + active, data = train_data)
summary(model1)
##
## Call:
## lm(formula = expenditure ~ income + share + dependents + months +
## active, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -512.80 -29.22 3.86 28.43 1072.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -129.64194 8.34869 -15.528 <2e-16 ***
## income 41.34601 2.03885 20.279 <2e-16 ***
## share 2375.97225 35.16316 67.570 <2e-16 ***
## dependents 6.37492 2.73331 2.332 0.0199 *
## months -0.09740 0.04771 -2.041 0.0415 *
## active 0.78291 0.51004 1.535 0.1251
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 96.01 on 917 degrees of freedom
## Multiple R-squared: 0.8403, Adjusted R-squared: 0.8395
## F-statistic: 965.4 on 5 and 917 DF, p-value: < 2.2e-16
Evaluate whether the predictor exist curvature through residual plots.
res <- residuals(model1)
# expenditure
plot(train_data$expenditure, res,
xlab = "income",
ylab = "Residuals",
main = "Residuals vs expenditure")
abline(h = 0, col = "red")
# income
plot(train_data$income, res,
xlab = "income",
ylab = "Residuals",
main = "Residuals vs income")
abline(h = 0, col = "red")
# share
plot(train_data$share, res,
xlab = "share",
ylab = "Residuals",
main = "Residuals vs share")
abline(h = 0, col = "red")
# dependents
plot(train_data$dependents, res,
xlab = "dependents",
ylab = "Residuals",
main = "Residuals vs dependents")
abline(h = 0, col = "red")
# months
plot(train_data$months, res,
xlab = "months",
ylab = "Residuals",
main = "Residuals vs months")
abline(h = 0, col = "red")
# active
plot(train_data$active, res,
xlab = "active",
ylab = "Residuals",
main = "Residuals vs active")
abline(h = 0, col = "red")
From the performance of residual plots, there are curvature existed in the variable income, share, and month.
# create a reusable function to plot residual vs fitted plot
plot_diagnostics <- function(model, title = NULL) {
res <- residuals(model)
fit <- fitted(model)
# Residuals vs Fitted
plot(fit, res,
xlab = "Fitted Values",
ylab = "Residuals",
main = ifelse(is.null(title),
"Residuals vs Fitted",
paste(title, "- Residuals vs Fitted")))
abline(h = 0, col = "red", lwd = 2)
# Q-Q Plot
qqnorm(res,
main = ifelse(is.null(title),
"Normal Q-Q Plot",
paste(title, "- Q-Q Plot")))
qqline(res, col = "red", lwd = 2)
}
plot(model1, which=1) # residuals vs fitted
qqnorm(resid(model1)); qqline(resid(model1)) # normality
# the residual scatter plot shows a cone shape with high variance. This means that the residuals exponentially increases as the values increase. There is also a broad trend curve exists. A transformation is required.
# Polynomial terms
train_data$months_c <- train_data$months - mean(train_data$months)
train_data$months_c2 <- train_data$months_c^2
train_data$income2 <- train_data$income^2
train_data$share2 <- train_data$share^2
train_data$months2 <- train_data$months^2
# income + share
model2 <- lm(expenditure ~ income + income2 + share + share2 + months + dependents + active, data = train_data)
summary(model2)
##
## Call:
## lm(formula = expenditure ~ income + income2 + share + share2 +
## months + dependents + active, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -351.46 -36.67 -4.28 30.25 1091.68
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -228.8530 14.5803 -15.696 < 2e-16 ***
## income 88.1860 6.8354 12.901 < 2e-16 ***
## income2 -4.7251 0.6604 -7.155 1.71e-12 ***
## share 2740.3686 64.5730 42.438 < 2e-16 ***
## share2 -921.9591 140.9000 -6.543 1.00e-10 ***
## months -0.1040 0.0454 -2.291 0.0222 *
## dependents 4.1192 2.6248 1.569 0.1169
## active 0.4368 0.4869 0.897 0.3699
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 91.35 on 915 degrees of freedom
## Multiple R-squared: 0.8558, Adjusted R-squared: 0.8547
## F-statistic: 775.6 on 7 and 915 DF, p-value: < 2.2e-16
# adj r^2: 0.8547
# all quadratic terms
model3 <- lm(expenditure ~ income + income2 + share + share2 + months + months2 + dependents + active, data = train_data)
summary(model3)
##
## Call:
## lm(formula = expenditure ~ income + income2 + share + share2 +
## months + months2 + dependents + active, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -350.81 -36.44 -4.13 30.53 1092.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.295e+02 1.464e+01 -15.681 < 2e-16 ***
## income 8.794e+01 6.852e+00 12.833 < 2e-16 ***
## income2 -4.698e+00 6.625e-01 -7.091 2.66e-12 ***
## share 2.740e+03 6.460e+01 42.421 < 2e-16 ***
## share2 -9.211e+02 1.410e+02 -6.535 1.06e-10 ***
## months -5.487e-02 9.932e-02 -0.552 0.581
## months2 -1.757e-04 3.158e-04 -0.556 0.578
## dependents 3.955e+00 2.642e+00 1.497 0.135
## active 4.201e-01 4.880e-01 0.861 0.390
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 91.39 on 914 degrees of freedom
## Multiple R-squared: 0.8558, Adjusted R-squared: 0.8546
## F-statistic: 678.2 on 8 and 914 DF, p-value: < 2.2e-16
# adj r^2: 0.8546
# transformation
# model4 <- lm(expenditure ~ log_income + log_share + dependents + months_c + months_c2 + active, data = train_data)
# summary(model4)
# # adj r^2: 0.8523
#
# # all sqrt (including polynomial months on sqrt scale)
# model5 <- lm(expenditure ~ log_income + log_share + dependents + sqrt_months_c + sqrt_months_c2 + active, data = train_data)
# summary(model5)
# # adj r^2: 0.8524
# compare the adjusted R^2 for the models that performed better
cat("Adjusted R^2: \nModel 1: ", summary(model1)$adj.r.squared, "\nModel 2: ", summary(model2)$adj.r.squared,
"\nModel 3: ", summary(model3)$adj.r.squared)
## Adjusted R^2:
## Model 1: 0.8394792
## Model 2: 0.8546664
## Model 3: 0.8545566
# compare the VIF for multicollinearity
cat("\nVIF: \nModel 1: ")
##
## VIF:
## Model 1:
print(vif(model1), type = "predictor")
## income share dependents months active
## 1.180984 1.015459 1.148239 1.036684 1.034308
cat ("\nModel 2: ")
##
## Model 2:
print(vif(model2), type = "predictor")
## income income2 share share2 months dependents active
## 14.660940 14.031714 3.782285 3.770604 1.036922 1.169552 1.041060
cat("\nModel 3: ")
##
## Model 3:
print(vif(model3), type = "predictor")
## income income2 share share2 months months2 dependents
## 14.723042 14.109987 3.782295 3.771032 4.959193 4.889934 1.184302
## active
## 1.045025
# residual plot
plot(model1, which=1) # residuals vs fitted
qqnorm(resid(model1)); qqline(resid(model1)) # normality
plot(model2, which=1) # residuals vs fitted
qqnorm(resid(model2)); qqline(resid(model2)) # normality
plot(model3, which=1) # residuals vs fitted
qqnorm(resid(model3)); qqline(resid(model3)) # normality
After checking for curvature in the predictors, we found evidence of nonlinear patterns in income, share, and months, so we explored adding quadratic terms instead of applying transformations at this stage. Among all fitted curvature models, the two best-performing ones were: (1) the model including quadratic terms for income and share, and (2) the model including quadratic terms for income, share, and months. Although the residual-versus-fitted and Q–Q plots did not show substantial visual improvement over the baseline model (only a little bit), these two models achieved the highest adjusted R^2 values among all curvature combinations and surpassed the baseline model (model 1). Therefore, these two curvature-enhanced models are retained for further consideration in the next modeling stage.
Each member is focusing on different predictors, and I was assigned to explore more about the potentials of variable \(months\).
#fit model with different combination of interaction with months
model4 <- lm(expenditure ~ income + share + dependents * months + active, data = train_data)
model5 <- lm(expenditure ~ income + share * months + dependents + active, data = train_data)
model6 <- lm(expenditure ~ income * months + share + dependents + active, data = train_data)
model7 <- lm(expenditure ~ income + share + dependents + active * months, data = train_data)
cat("Adjusted R^2: \nModel 4: ", summary(model4)$adj.r.squared,
"\nModel 5: ", summary(model5)$adj.r.squared,
"\nModel 6: ", summary(model6)$adj.r.squared,
"\nModel 7: ", summary(model7)$adj.r.squared)
## Adjusted R^2:
## Model 4: 0.8394281
## Model 5: 0.8398366
## Model 6: 0.8412528
## Model 7: 0.8393493
# Use F test to see whether the interaction term is statistically significant
anova(model1, model4)
## Analysis of Variance Table
##
## Model 1: expenditure ~ income + share + dependents + months + active
## Model 2: expenditure ~ income + share + dependents * months + active
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 917 8452561
## 2 916 8446032 1 6529.8 0.7082 0.4003
anova(model1, model5)
## Analysis of Variance Table
##
## Model 1: expenditure ~ income + share + dependents + months + active
## Model 2: expenditure ~ income + share * months + dependents + active
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 917 8452561
## 2 916 8424545 1 28016 3.0462 0.08126 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model1, model6)
## Analysis of Variance Table
##
## Model 1: expenditure ~ income + share + dependents + months + active
## Model 2: expenditure ~ income * months + share + dependents + active
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 917 8452561
## 2 916 8350052 1 102509 11.245 0.000831 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model1, model7)
## Analysis of Variance Table
##
## Model 1: expenditure ~ income + share + dependents + months + active
## Model 2: expenditure ~ income + share + dependents + active * months
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 917 8452561
## 2 916 8450173 1 2388.3 0.2589 0.611
# plot the residual vs fitted model and qq plot to see any improvements
plot(model4, which=1) # residuals vs fitted
qqnorm(resid(model4)); qqline(resid(model4)) # normality
plot(model5, which=1) # residuals vs fitted
qqnorm(resid(model5)); qqline(resid(model5)) # normality
plot(model6, which=1) # residuals vs fitted
qqnorm(resid(model6)); qqline(resid(model6)) # normality
plot(model7, which=1) # residuals vs fitted
qqnorm(resid(model7)); qqline(resid(model7)) # normality
Based on the F-tests comparing models 4–7 against model 1, models 4 and 7 do not show significant improvement, while models 5 and 6 are statistically significant. Although the residual diagnostics did not show noticeable changes across these models, model 6 achieves the highest adjusted R^2 among models 4–7 and provides a statistically significant improvement over model 1. Therefore, we retain the interaction term \(income * months\) for further consideration. [specify income * months]
n <- nrow(train_data)
n
## [1] 923
Because the train dataset contains 923 observations, the sample size is more than sufficient to support models that include quadratic and interaction terms. When evaluating multicollinearity using VIF across all expanded models so far, none of the predictors, including quadratic and interaction terms, exceeded a VIF of 5. This indicates that the models are stable, and there is no evidence of harmful multicollinearity. Since (1) the sample size is large, (2) the expanded models do not overload the data, and (3) multicollinearity remains at acceptable levels, variable screening is not required at this stage. You can safely proceed to the next modeling step using the full set of first-order predictors along with the candidate curvature and interaction terms identified earlier.
# Log transforms
train_data$log_income <- log(train_data$income)
train_data$log_share <- log(train_data$share + 1) # +1 if share has zeros
train_data$log_months <- log(train_data$months)
# Sqrt transforms
train_data$sqrt_income <- sqrt(train_data$income)
train_data$sqrt_share <- sqrt(train_data$share)
train_data$sqrt_months <- sqrt(train_data$months)
# Centering log transforms
train_data$log_income_c <- train_data$log_income - mean(train_data$log_income)
train_data$log_income_c2 <- train_data$log_income_c^2
train_data$log_share_c <- train_data$log_share - mean(train_data$log_share)
train_data$log_share_c2 <- train_data$log_share_c^2
train_data$log_months_c <- train_data$log_months - mean(train_data$log_months)
train_data$log_months_c2 <- train_data$log_months_c^2
# Centering sqrt transforms
train_data$sqrt_income_c <- train_data$sqrt_income - mean(train_data$sqrt_income)
train_data$sqrt_income_c2 <- train_data$sqrt_income_c^2
train_data$sqrt_share_c <- train_data$sqrt_share - mean(train_data$sqrt_share)
train_data$sqrt_share_c2 <- train_data$sqrt_share_c^2
train_data$sqrt_months_c <- train_data$sqrt_months - mean(train_data$sqrt_months)
train_data$sqrt_months_c2 <- train_data$sqrt_months_c^2
Based on previous analyses from last two weeks, we found that all variables that we are using, including both predictors and the response, are right-skewed. Furthermore, in part 2, we identified four variables, which are expenditure, income, share, and months, that exhibit curvature, violating the assumptions of normality and constant variance. Therefore, it is necessary to perform a transformation. Regarding Y, the expenditure, I will use Box-Cox transformation to assess whether transforming the response variable can improve these assumptions. Regarding X, I will explore potential transformations (such as log or square-root) for the three predictor variables.
# box cox requires variables to be all positive, so we use log expenditure squared
bc <- boxcox(lm((expenditure+1) ~ income + share + dependents + months + active, data=train_data))
lambda <- bc$x[which.max(bc$y)]
lambda # 0.5050505
## [1] 0.5050505
To address the issue of negative values existed in expenditure (Y), I applied Box-Cox to expenditure + 1 and found that the optimal lambda is 0.505. This result indicates that a square-root transformation of the response variable is most appropriate for stabilizing variance and improving the normality of residuals.
train_data$sqrt_expenditure <- (train_data$expenditure + 1)^0.505
model_trans <- lm(sqrt_expenditure ~ income + share + dependents + months + active, data=train_data)
plot(model_trans, which=1, main="Residuals vs Fitted (Transformed Y)")
qqnorm(resid(model_trans)); qqline(resid(model_trans), main="QQ Plot (Transformed Y)")
The plot diagnosises show that transform Y makes the situation worse. there is still a clear curvature exist in the residual vs fitted plot, and even more deviations in the QQ plot.
First-order terms: expenditure ~ income + share + dependents + months + active
Centered polynomial terms: combination of \(income^2 + share^2 + share^2\)
Interaction terms: \(income \*months\)
Transformed predictors: \(share\)
transformed response: \(sqrt(expenditure)\)
# Interaction term, use the centered variable
train_data$income_c <- train_data$income - mean(train_data$income)
train_data$share_c <- train_data$share - mean(train_data$share)
train_data$income_months_share <- train_data$income_c * train_data$months * train_data$share_c
# Centered polynomial terms
train_data$income_c2 <- train_data$income_c^2
train_data$log_share_c2 <- train_data$log_share_c^2
refined_model3 <- lm(sqrt_expenditure ~ income_c + income_c2 + sqrt_share_c + log_share_c2 +
months + income_months_share + dependents + active,
data = train_data) # adjusted R^2: 0.971
summary(refined_model3)
##
## Call:
## lm(formula = sqrt_expenditure ~ income_c + income_c2 + sqrt_share_c +
## log_share_c2 + months + income_months_share + dependents +
## active, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.4202 -0.5621 -0.0057 0.6437 10.5039
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.120e+01 9.856e-02 113.604 < 2e-16 ***
## income_c 1.672e+00 4.458e-02 37.512 < 2e-16 ***
## income_c2 -7.415e-02 1.080e-02 -6.864 1.23e-11 ***
## sqrt_share_c 5.241e+01 3.531e-01 148.416 < 2e-16 ***
## log_share_c2 -1.254e+01 3.097e+00 -4.050 5.56e-05 ***
## months 2.540e-03 7.475e-04 3.398 0.000709 ***
## income_months_share 7.872e-02 3.966e-03 19.850 < 2e-16 ***
## dependents 3.530e-02 4.136e-02 0.853 0.393634
## active 2.392e-03 7.651e-03 0.313 0.754633
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.435 on 914 degrees of freedom
## Multiple R-squared: 0.9712, Adjusted R-squared: 0.971
## F-statistic: 3853 on 8 and 914 DF, p-value: < 2.2e-16
cat("The final refined model:\nAdjusted R^2: ", summary(refined_model3)$adj.r.square)
## The final refined model:
## Adjusted R^2: 0.9709521
cat("\nVIF: \n")
##
## VIF:
vif(refined_model3)
## income_c income_c2 sqrt_share_c log_share_c2
## 2.526913 2.403627 1.392025 1.413630
## months income_months_share dependents active
## 1.138728 1.280959 1.176237 1.041303
# Check residuals
plot(refined_model3, which=1) # residuals vs fitted
qqnorm(resid(refined_model3)); qqline(resid(refined_model3)) # normality
test_data$income_c <- test_data$income - mean(train_data$income)
test_data$share_c <- test_data$share - mean(train_data$income)
test_data$income_months_share <- test_data$income_c * test_data$months * test_data$share_c
# Centered polynomial terms
test_data$income_c2 <- test_data$income_c^2
test_data$log_share <- log(test_data$share + 1) # +1 if share has zeros
test_data$log_share_c <- test_data$log_share - mean(train_data$log_share)
test_data$log_share_c2 <- test_data$log_share_c^2
test_data$sqrt_share <- sqrt(test_data$share)
test_data$sqrt_share_c <- test_data$sqrt_share - mean(train_data$sqrt_share)
pred_test <- predict(refined_model3, newdata = test_data)
RMSE_pred <- sqrt(mean((test_data$expenditure - pred_test)^2))
RMSE_pred
## [1] 389.095
final_model <- lm(expenditure ~ income + sqrt(share) + dependents + months + active + share:dependents,
data = train_data)
pred_test <- predict(final_model, newdata = test_data)
# RMSE: square root of mean squared errors
RMSE <- sqrt(mean((test_data$expenditure - pred_test)^2))
RMSE
## [1] 181.2567
The RMSE of the final model using the test data to fit is 389.095, which is extremely large. This result indicates that the great performance of the final model, which the adjusted R^2 is 0.971, could possibly be a overfitting situation. The refined model learned all the noises and overly accomandate to the data, eventually caused the high variance. However, the RMSE of the final model using the training data [help]